About Document

An example of cleaning and plotting data scraped from Indeed (scraper code at https://github.com/henrique1837/indeed_scraper)

Treating data

library(plyr)
library(stringr)
library(ggplot2)
library(plotly)
library(leaflet)
#### Read data ####
# Collect every file under ./results whose path contains "indeed".
files <- Sys.glob("./results/*")
files <- files[str_detect(string = files, pattern = "indeed")]
df_t <- data.frame()
# seq_along() keeps the loop from running over c(1, 0) when no file matches.
for (i in seq_along(files)) {
  df <- read.csv(file = files[i],
                 stringsAsFactors = FALSE)
  # Drop postings whose age is reported as "30+": no exact day count can be
  # derived from them. fixed() makes the "+" a literal character instead of a
  # regex quantifier. Rows with a missing date are dropped as well, because
  # which() discards NA.
  df <- df[which(!str_detect(string = df$date,
                             pattern = fixed("30+"))), ]
  # The scrape date is taken from the YYYY-MM-DD stamp in the file name.
  file_date <- as.Date(str_extract(string = files[i],
                       pattern = "[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}"))
  # Posting date = scrape date minus the first number found in the age text.
  df$date <- file_date - as.numeric(str_extract(string = df$date,
                                                pattern = "[[:digit:]]+"))
  # Age texts without any digit get the scrape date itself.
  df$date[which(is.na(df$date))] <- file_date
  # Files without a country column are labelled "USA". Checking the column
  # name (rather than the column length) also works for a file with zero rows.
  if (!("country" %in% names(df))) {
    df$country <- rep("USA", nrow(df))
  }
  # Append only postings whose link has not been collected from earlier files.
  df_t <- rbind(df_t, df[which(!(df$link %in% df_t$link)), ])
  #message("Files ",i," - ",length(files))
}
# City: the text before the first comma or opening parenthesis, trimmed.
# Cutting only at those separators (instead of at any punctuation) keeps names
# such as "St. Louis" or "Winston-Salem" intact so they can be matched against
# the coordinates table later on.
# NOTE(review): assumes locations look like "City, ST 12345" - confirm against
# the scraper output.
df_t$city <- trimws(sub(pattern = "\\s*[,(].*$",
                        replacement = "",
                        x = df_t$location))
# State: the first stand-alone two-capital-letter token. The word boundaries
# stop the pattern from matching inside longer upper-case words.
df_t$state <- str_extract(string = df_t$location,
                          pattern = "\\b[A-Z]{2}\\b")
# One unit per posting; summed by the aggregations that follow.
df_t$count <- 1
# Sum the per-posting counter of df_t within each combination of the given
# grouping columns; the result has one row per group and a totalJobs column.
sum_jobs_by <- function(group_cols) {
  ddply(df_t, group_cols, summarise, totalJobs = sum(count))
}

# Postings per date.
df_agreggated <- sum_jobs_by("date")

# Postings per date and company.
df_companies_date <- sum_jobs_by(c("date", "company"))

# Postings per company.
df_companies <- sum_jobs_by("company")

# Postings per place (city, state, country).
df_places <- sum_jobs_by(c("city", "state", "country"))
# Download lat and long of USA cities (source https://simplemaps.com/data/us-cities)
# The file is cached under ./results and only fetched when it is missing.
lat_long_file <- "./results/USA_lat_long.csv"
if (!file.exists(lat_long_file)) {
  # A failed download must not leave a partial file behind, otherwise every
  # later run would skip the download and read the broken file.
  download_status <- tryCatch(
    download.file(url = "https://simplemaps.com/static/data/us-cities/uscitiesv1.4.csv",
                  destfile = lat_long_file),
    error = function(e) {
      unlink(lat_long_file)
      stop("Could not download the USA cities coordinates file: ",
           conditionMessage(e), call. = FALSE)
    }
  )
  # download.file() reports success with a status code of 0.
  if (!identical(as.integer(download_status), 0L)) {
    unlink(lat_long_file)
    stop("Could not download the USA cities coordinates file (status ",
         download_status, ")", call. = FALSE)
  }
}
df_lat_long_cities <- read.csv(file = lat_long_file,
                               stringsAsFactors = FALSE)
## Preparing dataframe for leaflet map ##
# For every USA place, look up its coordinates in df_lat_long_cities by
# case-insensitive city name; when several cities share the name, prefer the
# one whose state_id equals the place's state.
df_placesUSA <- df_places[which(df_places$country == "USA"), ]
n_places <- nrow(df_placesUSA)
# Preallocate one slot per place. Places without a match keep NA in all four
# vectors, so the result length no longer depends on which rows matched.
lats <- rep(NA_real_, n_places)
longs <- rep(NA_real_, n_places)
cities <- rep(NA_character_, n_places)
totalJobs <- rep(NA_real_, n_places)
# Upper-case the lookup columns once instead of on every iteration.
lookup_city <- toupper(df_lat_long_cities$city)
lookup_state <- toupper(df_lat_long_cities$state_id)
for (i in seq_len(n_places)) {

  indice <- which(lookup_city == toupper(df_placesUSA$city[i]))
  if (length(indice) == 0) {
    next
  }
  if (length(indice) > 1) {
    # which() drops NA comparisons, so a missing state simply yields no
    # state match instead of an error.
    same_state <- indice[which(lookup_state[indice] ==
                                 toupper(df_placesUSA$state[i]))]
    if (length(same_state) > 0) {
      # As before, the last candidate in the matching state wins.
      indice <- same_state[length(same_state)]
    } else {
      # No candidate is in the matching state: fall back to the first
      # same-name city. Without this, a multi-element index was assigned
      # into a single slot and R warned that the number of items to replace
      # is not a multiple of the replacement length.
      indice <- indice[1]
    }
  }
  totalJobs[i] <- df_placesUSA$totalJobs[i]
  cities[i] <- df_placesUSA$city[i]
  lats[i] <- df_lat_long_cities$lat[indice]
  longs[i] <- df_lat_long_cities$lng[indice]

}
## Warning in lats[i] <- df_lat_long_cities$lat[indice]: número de itens para
## para substituir não é um múltiplo do comprimento do substituto
## Warning in longs[i] <- df_lat_long_cities$lng[indice]: número de itens para
## para substituir não é um múltiplo do comprimento do substituto
# Bundle the matched vectors into the data frame used for the map: one row per
# slot, with columns city, lat, lng and totalJobs.
df_map <- do.call(
  data.frame,
  list(city = cities, lat = lats, lng = longs, totalJobs = totalJobs)
)

Data Information

Range of dates

## [1] "2018-12-29" "2019-02-15"

Total Observations

## [1] 247

Total companies listed

## [1] 85

Total locations listed

## [1] 50

Total locations listed in USA

## [1] 43

Graphics

Total Ethereum Jobs posted per date

Total Ethereum Jobs posted by top 10 companies

Total Ethereum Jobs posted in top 10 locations

Map of Ethereum Jobs locations posted in USA

## Warning in validateCoords(lng, lat, funcName): Data contains 4 rows with
## either missing or invalid lat/lon values and will be ignored

Total Ethereum Jobs posted by top 10 companies per date

Session

## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.5 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=pt_PT.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=pt_BR.UTF-8        LC_COLLATE=pt_PT.UTF-8    
##  [5] LC_MONETARY=pt_BR.UTF-8    LC_MESSAGES=pt_PT.UTF-8   
##  [7] LC_PAPER=pt_BR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=pt_BR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] bindrcpp_0.2.2 leaflet_2.0.2  plotly_4.8.0   ggplot2_3.1.0 
## [5] stringr_1.3.1  plyr_1.8.4    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.0        later_0.7.5       pillar_1.3.1     
##  [4] compiler_3.4.4    bindr_0.1.1       tools_3.4.4      
##  [7] digest_0.6.18     viridisLite_0.3.0 jsonlite_1.6     
## [10] evaluate_0.12     tibble_2.0.1      gtable_0.2.0     
## [13] pkgconfig_2.0.2   rlang_0.3.1       shiny_1.2.0      
## [16] crosstalk_1.0.0   yaml_2.2.0        xfun_0.4         
## [19] withr_2.1.2       dplyr_0.7.8       httr_1.4.0       
## [22] knitr_1.21        htmlwidgets_1.3   grid_3.4.4       
## [25] tidyselect_0.2.5  glue_1.3.0        data.table_1.12.0
## [28] R6_2.3.0          rmarkdown_1.11    tidyr_0.8.2      
## [31] purrr_0.3.0       magrittr_1.5      promises_1.0.1   
## [34] scales_1.0.0      htmltools_0.3.6   assertthat_0.2.0 
## [37] xtable_1.8-3      mime_0.6          colorspace_1.4-0 
## [40] httpuv_1.4.5.1    labeling_0.3      stringi_1.2.4    
## [43] lazyeval_0.2.1    munsell_0.5.0     crayon_1.3.4